---
title: Analyse customers
keywords: fastai
sidebar: home_sidebar
summary: "transactions per user,"
description: "transactions per user,"
nb_path: "nbs/04_data_analyse_customers.ipynb"
---
{% raw %}
{% endraw %} {% raw %}
{% endraw %}

Compute transactions per user

For each customer, compute the number of purchases made, together with the minimum, maximum, average and total amounts spent across all of their visits.

Original section: 4.1.3 Consumer Order Combinations. Original cells: 47-49

{% raw %}
from customer_segmentation_toolkit.data.load_split import load_data_csv

DATA = '../data/output/03_data_compute_description_keywords'
with open(f'{DATA}/n_purchase_clusters.txt', 'r') as f:
    N_PURCHASE_CLUSTERS = int(f.read())
logging.info(f'N_PURCHASE_CLUSTERS={N_PURCHASE_CLUSTERS}')

basket_price = load_data_csv(f'{DATA}/no_live_data__cleaned__purchase_clusters__train.csv')
basket_price.head()
INFO:root:N_PURCHASE_CLUSTERS=5
CustomerID InvoiceNo Basket Price categ_0 categ_1 categ_2 categ_3 categ_4 InvoiceDate
0 12347 537626 711.79 293.35 23.40 83.40 124.44 187.2 2010-12-07 14:57:00.000001024
1 12347 542237 475.39 169.20 84.34 53.10 38.25 130.5 2011-01-26 14:29:59.999999744
2 12347 549222 636.25 115.00 81.00 71.10 38.25 330.9 2011-04-07 10:42:59.999999232
3 12347 556201 382.52 168.76 41.40 78.06 19.90 74.4 2011-06-09 13:01:00.000000256
4 12348 539318 892.80 0.00 478.80 0.00 240.00 174.0 2010-12-16 19:09:00.000000000
{% endraw %} {% raw %}

build_transactions_per_user[source]

build_transactions_per_user(basket_price:DataFrame, n_purchase_clusters:int)

{% endraw %} {% raw %}
{% endraw %} {% raw %}
transactions_per_user = build_transactions_per_user(basket_price, n_purchase_clusters=N_PURCHASE_CLUSTERS)
transactions_per_user.head()
CustomerID count min max mean sum categ_0 categ_1 categ_2 categ_3 categ_4 LastPurchase FirstPurchase
0 12347 4 382.52 711.79 551.487500 2205.95 33.831682 10.432693 12.949523 10.011106 32.774995 52 236
1 12348 3 227.44 892.80 495.746667 1487.24 0.000000 45.940131 0.000000 21.516366 32.543503 117 227
2 12350 1 334.40 334.40 334.400000 334.40 0.000000 11.692584 27.900718 11.961722 48.444976 179 179
3 12352 4 144.35 840.30 360.370000 1441.48 5.771846 0.707606 3.683714 78.356966 11.479868 131 165
4 12353 1 89.00 89.00 89.000000 89.00 0.000000 0.000000 19.887640 67.078652 13.033708 73 73
{% endraw %} {% raw %}

compute_n_customers_with_unique_purchase[source]

compute_n_customers_with_unique_purchase(transactions_per_user:DataFrame)

{% endraw %} {% raw %}
{% endraw %} {% raw %}
n1 = compute_n_customers_with_unique_purchase(transactions_per_user)
n2 = transactions_per_user.shape[0]
print("nb. de clients avec achat unique: {:<2}/{:<5} ({:<2.2f}%)".format(n1,n2,n1/n2*100))
nb. de clients avec achat unique: 1358/3143  (43.21%)
{% endraw %}

Analyse customers distribution

PCA over customers

Original section: 4.2 Creation of customers categories. Original cells: 50-53

{% raw %}

convert_customers_df_to_np[source]

convert_customers_df_to_np(transactions_per_user:DataFrame, n_purchase_clusters:int)

{% endraw %} {% raw %}

analyse_customers_pca[source]

analyse_customers_pca(matrix:ndarray, n_components=None)

{% endraw %} {% raw %}
{% endraw %} {% raw %}

plot_customers_pca[source]

plot_customers_pca(matrix:ndarray, pca:PCA)

{% endraw %} {% raw %}
{% endraw %} {% raw %}
matrix = convert_customers_df_to_np(transactions_per_user, N_PURCHASE_CLUSTERS)
scaled_matrix, pca = analyse_customers_pca(matrix)
plot_customers_pca(matrix, pca)
2021-05-25T18:50:04.220563 image/svg+xml Matplotlib v3.4.2, https://matplotlib.org/
{% endraw %}

Analyse customers categories

Build customer clusters via k-means.

Original section: 4.2.2 Creation of customer categories. Original cells: 54-

{% raw %}

compute_customer_clusters[source]

compute_customer_clusters(scaled_matrix:ndarray, n_clusters:int)

{% endraw %} {% raw %}
{% endraw %} {% raw %}
N_CUSTOMER_CLUSTERS = 11

clusters_clients = compute_customer_clusters(scaled_matrix, N_CUSTOMER_CLUSTERS)
print(pd.DataFrame(pd.Series(clusters_clients).value_counts(), columns = ['nb. de clients']).T)

silhouette_avg = silhouette_score(scaled_matrix, clusters_clients)
print('score de silhouette: {:<.3f}'.format(silhouette_avg))
                  5    2    3    0    4    1    8    10  7   6   9 
nb. de clients  1184  444  342  273  261  233  207  150  32  10   7
score de silhouette: 0.220
{% endraw %} {% raw %}
from customer_segmentation_toolkit.data.analyse_purchases import plot_silhouette

sample_silhouette_values = silhouette_samples(scaled_matrix, clusters_clients)
plot_silhouette(N_CUSTOMER_CLUSTERS, [-0.15, 0.55], len(scaled_matrix), sample_silhouette_values, clusters_clients)
2021-05-25T18:50:24.066311 image/svg+xml Matplotlib v3.4.2, https://matplotlib.org/
{% endraw %} {% raw %}

plot_customer_categories[source]

plot_customer_categories(scaled_matrix:ndarray, clusters_clients:ndarray, n_customer_clusters:int)

{% endraw %} {% raw %}
{% endraw %} {% raw %}
plot_customer_categories(scaled_matrix, clusters_clients, N_CUSTOMER_CLUSTERS)
2021-05-25T18:50:25.476617 image/svg+xml Matplotlib v3.4.2, https://matplotlib.org/
{% endraw %}

Original cells 59-61:

{% raw %}

add_customer_clusters_info[source]

add_customer_clusters_info(transactions_per_user:DataFrame, clusters_clients:ndarray)

{% endraw %} {% raw %}
{% endraw %} {% raw %}
selected_customers_df = add_customer_clusters_info(transactions_per_user, clusters_clients)
selected_customers_df
CustomerID count min max mean sum categ_0 categ_1 categ_2 categ_3 categ_4 LastPurchase FirstPurchase cluster
0 12347 4 382.52 711.79 551.487500 2205.95 33.831682 10.432693 12.949523 10.011106 32.774995 52 236 5
1 12348 3 227.44 892.80 495.746667 1487.24 0.000000 45.940131 0.000000 21.516366 32.543503 117 227 8
2 12350 1 334.40 334.40 334.400000 334.40 0.000000 11.692584 27.900718 11.961722 48.444976 179 179 3
3 12352 4 144.35 840.30 360.370000 1441.48 5.771846 0.707606 3.683714 78.356966 11.479868 131 165 2
4 12353 1 89.00 89.00 89.000000 89.00 0.000000 0.000000 19.887640 67.078652 13.033708 73 73 2
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
3138 18273 1 51.00 51.00 51.000000 51.00 0.000000 0.000000 100.000000 0.000000 0.000000 126 126 0
3139 18280 1 180.60 180.60 180.600000 180.60 24.833887 0.000000 41.140642 34.025471 0.000000 146 146 0
3140 18281 1 80.82 80.82 80.820000 80.82 0.000000 18.708241 18.930958 41.945063 20.415739 49 49 5
3141 18283 9 2.50 192.80 105.770000 951.93 9.464982 16.259599 31.669345 2.768061 39.838013 17 206 5
3142 18287 1 765.28 765.28 765.280000 765.28 1.960067 5.315701 17.601401 4.442818 70.680013 70 70 3

3143 rows × 14 columns

{% endraw %} {% raw %}

compute_aggregated_customer_clusters_info[source]

compute_aggregated_customer_clusters_info(selected_customers:DataFrame, n_purchase_clusters:int, n_customer_clusters:int, categ_threshold:int=40)

{% endraw %} {% raw %}
{% endraw %} {% raw %}
merged_df = compute_aggregated_customer_clusters_info(selected_customers_df, N_PURCHASE_CLUSTERS, N_CUSTOMER_CLUSTERS)
print('number of customers:', merged_df['size'].sum())
merged_df
number of customers: 3143
cluster count min max mean sum categ_0 categ_1 categ_2 categ_3 categ_4 LastPurchase FirstPurchase size
0 4.0 1.946360 198.249349 299.845287 245.273822 550.350958 50.355272 6.492344 12.949009 16.205537 14.019682 104.260536 151.831418 261
1 8.0 2.067633 183.609855 313.588357 239.017139 546.267971 6.517860 55.758966 10.144680 8.446232 19.136001 81.009662 134.550725 207
2 0.0 2.102564 196.943810 325.576410 259.460956 583.629670 7.631191 7.254663 56.684704 10.993406 17.436036 98.703297 152.934066 273
3 2.0 2.308559 191.690113 314.013626 245.908891 606.879662 9.635255 4.487470 10.976299 64.461005 10.452680 92.808559 143.659910 444
4 3.0 2.035088 208.900851 293.320822 248.641869 531.040997 6.692807 10.444100 11.491105 9.037000 62.339341 88.570175 134.900585 342
5 5.0 2.895270 207.083024 403.964764 300.380755 888.029949 16.414316 13.804908 21.530722 20.685761 27.569682 73.785473 161.244932 1184
6 1.0 1.815451 730.940300 1078.421592 894.459218 1788.829833 15.952632 12.359545 20.421345 23.560277 27.706201 81.214592 129.785408 233
7 7.0 1.343750 2132.672812 2282.714063 2209.146406 3020.640312 14.200917 19.706945 16.905242 26.299993 22.886903 103.093750 131.156250 32
8 10.0 15.026667 94.115400 1333.988000 508.098442 7474.363067 16.353351 11.784130 22.652679 23.636542 25.593525 17.193333 224.086667 150
9 9.0 72.714286 10.985714 1699.585714 367.013023 26021.358571 12.098634 12.827628 21.749964 26.012978 27.341145 1.285714 241.285714 7
10 6.0 16.600000 443.746000 14426.792000 4792.825838 61938.438000 17.394031 5.140863 28.881243 30.070514 18.513349 25.000000 209.300000 10
{% endraw %} {% raw %}
{% endraw %}